# Code used to estimate the narrative model for the U.S. Congressional Record and
# to output the associated dataframes for downstream empirical analysis

# The final dataframes contain useful metadata, such as:
# - speech ids
# - speaker information
# - original sentences
# - original semantic role statements
# - associated narratives
# - sentence sentiment scores
# - speech topic shares

# The code was run on an HPC cluster

# Before running these scripts, please consider the README!

# This bash file provides a step-by-step account of our implementation for replication
# Caveats:
# /!\ In principle, this bash file runs as is (with the exception of the human validation that requires human input)
# However, we heavily parallelized our computations on an HPC cluster (see comments below)

# Computational requirements:
# 35GB of RAM per core (except for summary_statistics.py which requires 80GB)
# Computing times (if *not* parallelized):
# For our main model (1000 clusters): approximately 85 days
# In addition, if more models (also 100 / 500 / 2000 clusters) are built for the human validation: 45 days (total of 45 + 85 = 130 days)

# Activate the Python virtual environment (looked up at ./venv relative to the current directory)
source venv/bin/activate

# Every script below is invoked by its bare file name, so the working directory must be ./code
# Abort if the directory change fails instead of running the remaining steps from the wrong place
cd ./code || { printf 'error: cannot change directory to ./code\n' >&2; exit 1; }

# We provide our seed dataset in /data/gpo_sentences/*.csv

# Annotate semantic roles
# Approximately 60 days of computing time (w/o parallelization)
# The resulting json-files are provided in /data/gpo_srl_annotations/*.json
# Since we parallelized the annotations into 29 batches, run_srl.py requires a command-line argument for the batch_id (0 to 28)
# Sequential run: annotate the 29 batches (batch ids 0 through 28) one after another
for batch_id in $(seq 0 28); do
  python3 run_srl.py "$batch_id"
done
# To run the scripts on a cluster computer, follow the job submission guidelines of your cluster; e.g.,
# bsub -J "srl" -W 1380 -n 1 -R "rusage[mem=35000]" python3 run_srl.py batch_id

# Build a narrative model on a batch of 50,000 randomly drawn speeches
# Our script takes batch 0 (same batching as for SRL above) to train the clustering model
# Approximately 2 days of computing time per narrative model.
# build_narrative_model.py is deterministic on a given machine but may produce slightly different clusters between machines
# This script requires a command-line argument for the number of clusters
# For the main part of the paper, we specified number_of_clusters = 1000
# Usage: python3 build_narrative_model.py number_of_clusters
python3 build_narrative_model.py 1000

# While the paper is based on 1000 clusters, we specified number_of_clusters = 100 / 500 / 1000 / 2000 for our human validation exercise
# python3 build_narrative_model.py 100
# python3 build_narrative_model.py 500
# python3 build_narrative_model.py 2000

# Predict the narratives for all batches
# get_narratives.py requires two command-line arguments: the batch_id (first) and the number of clusters (second)
# We parallelized by batch and by cluster number specification
# Approximately 15 days of computing time per narrative model (i.e., per cluster number specification)
# The resulting csv-files are provided in /data/gpo_narratives/*.csv
# Sequential run for the main model: predict narratives for batches 0 through 28 with 1000 clusters
for batch_id in $(seq 0 28); do
  python3 get_narratives.py "$batch_id" 1000
done
# More generally, for other cluster number specifications
# for batch_id in $(seq 0 28); do python3 get_narratives.py $batch_id number_of_clusters; done
# To run the scripts on a cluster computer, follow the job submission guidelines of your cluster; e.g.,
# bsub -J "get_narr" -W 1380 -n 1 -R "rusage[mem=35000]" python3 get_narratives.py batch_id 1000

# Create three dataframes (all narratives, all complete narratives, all complete narratives with frequency > 50)
# These dataframes also contain the underlying sentences and basic metadata
# The resulting csv-files are provided in /data/gpo_final_data/*.csv
# For the main part of the paper, we specified number_of_clusters = 1000
# Usage: python3 build_analysis_data_simple.py number_of_clusters
python3 build_analysis_data_simple.py 1000

# For our human validation exercise, we specified number_of_clusters = 100 / 500 / 1000 / 2000
# python3 build_analysis_data_simple.py 100
# python3 build_analysis_data_simple.py 500
# python3 build_analysis_data_simple.py 2000

# We do not recommend running the following scripts, since they require human input (validation exercise)
# First, draw a sample for each cluster number specification
# python3 human_validation_cluster_get_sample.py 100
# python3 human_validation_cluster_get_sample.py 500
# python3 human_validation_cluster_get_sample.py 1000
# python3 human_validation_cluster_get_sample.py 2000
# Then, create a form for human annotation that combines these samples
# python3 human_validation_cluster_create_form.py
# After the manual annotations, plot the results
#
# python3 human_validation_cluster_plot_results.py narrativefilter
# python3 human_validation_cluster_plot_results.py entityfilter

# Compute sentence compound sentiment scores with NLTK VADER
# (called without arguments; a later step enriches the final dataframe with sentence sentiment scores)
python3 get_sentence_sentiments.py

# Entity clusters are automatically labeled by the most frequent phrase in the cluster -- this usually performs well
# In some cases, however, manual relabeling provides more interpretable entity clusters and narratives
# We inspect the cluster labels with their associated phrases and re-label them manually when necessary
# Both scripts are called without arguments; the inspection script runs first, followed by
# add_manual_cluster_labels.py (presumably applies the manual re-labels -- confirm in the script)
python3 inspect_and_label_clusters.py
python3 add_manual_cluster_labels.py

# Enrich the final dataframe with the sentence sentiment scores and the topic information
# Run with python3, the interpreter used by every other step of this pipeline
python3 build_analysis_data_rich.py

# Plot summary statistics on the corpus before and after dimension reduction
# Note: this step requires 80GB of RAM (the other scripts require 35GB per core)
python3 summary_statistics.py
